#!/usr/bin/env python
# -*- encoding: utf-8 -*-
"""
# @Description Clean text data with regular expressions
# @Time : 2020/1/4 15:52 
# @Author : sky 
# @Site :  
# @File : REdealText.py 
# @Software: PyCharm
"""


import re


# Regex-based cleaning of a text string
# ASCII punctuation to strip (the same characters as ``string.punctuation``).
# The backslash is written as ``\\`` and escaped again by ``re.escape`` below,
# so it is a real member of the character class instead of merely escaping
# the ``]`` that follows it.
_ASCII_PUNCTUATION = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

# Full-width / typographic punctuation to strip.
_WIDE_PUNCTUATION = '’：；，—。★、…【】《》？“”‘！'

# One character class covering ASCII letters, digits, the punctuation above
# and all whitespace. ``re.escape`` keeps ``-``, ``[``, ``]``, ``^`` and the
# backslash literal, so no accidental ranges or escapes can sneak in.
_NOISE_RE = re.compile(
    '[a-zA-Z0-9' + re.escape(_ASCII_PUNCTUATION + _WIDE_PUNCTUATION) + r'\s]+'
)


def text_parse(text):
    """Strip ASCII letters, digits, punctuation and whitespace from ``text``.

    Every maximal run of unwanted characters (ASCII letters and digits, ASCII
    punctuation including the backslash, the full-width punctuation listed in
    ``_WIDE_PUNCTUATION``, and any whitespace such as newlines) is replaced by
    exactly one space.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string. A run at the very start or end of ``text`` still
        becomes a single space; the result is not trimmed.
    """
    # A single pass is enough: whitespace is part of the class, so mixed runs
    # of punctuation and whitespace (newlines included) collapse to one space
    # without a second substitution.
    return _NOISE_RE.sub(' ', text)


def read_file(path):
    """Return the entire contents of the file at ``path``, decoded as UTF-8.

    Args:
        path: Location of the text file to read.

    Returns:
        The file's full text as a single string.
    """
    with open(path, encoding='utf-8') as handle:
        return handle.read()


if __name__ == '__main__':
    # Load the sample file and show it exactly as stored on disk.
    raw_document = read_file('../dataset/CSCMNews/体育/0.txt')
    print(raw_document)

    # Show the same text after regex cleaning.
    print(text_parse(raw_document))
